Read in data
cellsdata = Read10X("/Users/tfriedrich/Downloads/FC_102418_with_markers/outs/filtered_feature_bc_matrix/")
cells = CreateSeuratObject(counts = cellsdata, project = "FC", min.cells = 3, min.features = 200)
Number of each cell showing expression for each marker.
length(which(cellsdata["EGFP.dna",]>0))
## [1] 446
length(which(cellsdata["lacZ.dna",]>0))
## [1] 9
length(which(cellsdata["mCerulean.dna",]>0))
## [1] 59
length(which(cellsdata["mCherry.dna",]>0))
## [1] 7
length(which(cellsdata["mOrange.dna",]>0))
## [1] 14
length(which(cellsdata["tdrfp.dna",]>0))
## [1] 496
Write cells expressing markers of interest to a file.
A = subset(subset(cells, lacZ.dna>0) ,Xist>0)
B = subset(subset(cells, lacZ.dna==0) ,Xist>0)
C = subset(subset(cells, tdrfp.dna>0) ,Xist=0)
tdrfp = subset(cells, tdrfp.dna>0)
lacZ = subset(cells, lacZ.dna>0)
mOrange = subset(cells, mOrange.dna>0)
mCerulean = subset(cells, mCerulean.dna>0)
mCherry = subset(cells, mCherry.dna>0)
write.csv(x = GetAssayData(A), file = "lacZ_positive_and_Xist_positive.csv", quote=FALSE)
write.csv(x = GetAssayData(B), file = "lacZ_negative_and_Xist_positive.csv", quote=FALSE)
write.csv(x = GetAssayData(C), file = "tdrfp_positve_and_Xist_negative.csv", quote=FALSE)
write.csv(x = GetAssayData(tdrfp), file = "tdrfp_positive.csv", quote=FALSE)
write.csv(x = GetAssayData(lacZ), file = "lacZ_positive.csv", quote=FALSE)
write.csv(x = GetAssayData(mOrange), file = "mOrange_positve.csv", quote=FALSE)
write.csv(x = GetAssayData(mCerulean), file = "mCerulean_positve.csv", quote=FALSE)
write.csv(x = GetAssayData(mCherry), file = "mCherry_positve.csv", quote=FALSE)
Gather scRNA-seq QC metric.
mito.genes <- grep(pattern = "^mt-", x = rownames(x = cells), value = TRUE)
percent.mito <- Matrix::colSums(cells[mito.genes, ])/Matrix::colSums(cells)
AddMetaData adds columns to object@meta.data, and is a great place to stash QC stats
cells <- AddMetaData(object = cells, metadata = percent.mito, col.name = "percent.mito")
Plot expressino of marker genes
VlnPlot(object = cells, features = c("nFeature_RNA", "nCount_RNA", "percent.mito"))
marker_genes = c("Hnf4a", "Alb", "Fah", "Krt8", "Krt18", "G6pc", "Sox9", "Krt7", "Krt19", "Epcam", "Onecut1", "Afp", "Xist")
VlnPlot(A,c("Hnf4a", "Alb", "Fah"))
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE],
## idents = idents, : All cells have the same value of Alb.
## Warning in SingleExIPlot(type = type, data = data[, x, drop = FALSE],
## idents = idents, : All cells have the same value of Fah.
VlnPlot(B,c("Hnf4a", "Alb", "Fah"))
VlnPlot(C,c("Hnf4a", "Alb", "Fah"))
cells <- NormalizeData(object = cells, normalization.method = "LogNormalize",
scale.factor = 10000)
vst: First, fits a line to the relationship of log(variance) and log(mean) using local polynomial regression (loess). Then standardizes the feature values using the observed mean and expected variance (given by the fitted line). Feature variance is then calculated on the standardized values after clipping to a maximum (see clip.max parameter).
cells <- FindVariableFeatures(object = cells, selection.method = "vst", nfeatures = 2000)
Regress out expression effects confounded by nUMI and percent.mito
cells <- ScaleData(object = cells, vars.to.regress = c("nUMI", "percent.mito"))
## Regressing out nUMI, percent.mito
## Centering and scaling data matrix
Regress out cell cycle genes. For each gene, Seurat models the relationship between gene expression and the S and G2M cell cycle scores. The scaled residuals of this model represent a ‘corrected’ expression matrix, that can be used downstream for dimensional reduction.
s.genes <- cc.genes$s.genes
g2m.genes <- cc.genes$g2m.genes
cells <- CellCycleScoring(object=cells, s.features = s.genes, g2m.features = g2m.genes, set.ident = TRUE)
## Warning in AddModuleScore(object = object, features = features, name =
## name, : Could not find enough features in the object from the following
## feature lists: S.Score Attempting to match case...Could not find enough
## features in the object from the following feature lists: G2M.Score
## Attempting to match case...
cells <- RunPCA(object = cells, pc.genes = cells@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5)
## PC_ 1
## Positive: Igfbp7, Spp1, Cp, Xist, Trf, Lcn2, Gc, C3, Ifitm3, Tmem176b
## Mgst1, Stmn1, Hmgn2, Tesc, Tceal9, Hpn, Selenoh, Cdk4, Tmsb10, Ptma
## Hnrnpa1, Slc25a4, Tmem176a, Npm1, Hmgn1, B2m, Nme1, Cald1, Snhg18, Cxcl5
## Negative: Lgals4, Plac8, Phgr1, Ctse, Gsta1, Gcnt3, Gm3336, Tff3, Psca, Spink4
## Lgals3, Cyp2s1, Msln, Txnip, Ly6a, AA467197, Duox2, Gsto1, 2210407C18Rik, Duoxa2
## Crip1, Cyp3a13, Fkbp5, Krt20, Cbr1, Gm10260, Ahnak, S100a14, Akr1b7, Rbp2
## PC_ 2
## Positive: Wfdc2, Birc5, Hmgb2, Cdca8, Ccnb2, Pclaf, Ccna2, Cdca3, Cdk1, Top2a
## Cenpa, Nusap1, Ube2c, Gpx2, Pbk, Cenpf, Racgap1, Esco2, Aurkb, Ccnb1
## Spc24, Hist1h2ap, Cdc20, Car2, Pimreg, Prc1, Smc2, Cks2, Sftpd, Knstrn
## Negative: Ccdc80, Aebp1, Col1a2, Sparc, Vim, Axl, Serping1, Lhfp, Col3a1, Bgn
## Ptgis, Nnmt, Rcn3, Loxl1, Col1a1, Lpl, Emilin1, Mxra8, Col5a2, Dcn
## Lgals1, Ak1, Gpx8, Fstl1, Col6a1, Colec12, Lrp1, Lbh, Gm12840, Cygb
## PC_ 3
## Positive: Alb, Fga, Cfi, Agt, Itih2, Ttc36, Lrg1, Tns1, St3gal5, Cp
## C3, Hpn, Fgg, Azgp1, Nrn1, Pah, Osgin1, Fgb, Lcn2, Ambp
## Spp1, Gstm1, Dcdc2a, Ces1d, Trf, Gas6, Slc5a1, Scn1b, Mgst1, Tmem176b
## Negative: Birc5, Pclaf, Pbk, Ccna2, Cdca8, Cdca3, Cdk1, Ube2c, Aurkb, Top2a
## Hist1h2ap, Nusap1, Spc24, Ccnb1, Mki67, Racgap1, Cenpa, Cdc20, Smc2, Ccnb2
## Tk1, Cenpf, Pimreg, Prc1, Esco2, Spc25, Cenpm, Plk1, Hmmr, Cdkn3
## PC_ 4
## Positive: Wfdc17, Lyz2, Fcer1g, Ccl6, Tyrobp, Ctss, Cd68, Apoe, Fcgr3, Laptm5
## Ms4a6d, Trem2, Cd53, C1qb, C1qa, Mpeg1, C1qc, Plek, Pla2g7, Gpnmb
## Selenop, Fcgr2b, Pirb, Cybb, C5ar1, Lypd8, Lilr4b, Clec4d, Pmp22, Rac2
## Negative: Ccnd1, Tnfrsf12a, Basp1, Krt7, Wfdc2, Plaur, Tubb2b, Lor, Sprr1a, Phgdh
## Cd44, Tpd52l1, Krt23, Hspb1, Serpinb6b, Plat, Cdh13, Cxcl16, Ctgf, Tacstd2
## Anxa1, Cryab, Edn1, Gprc5a, Cldn4, Cdk6, Pbx1, Msln, Phlda1, Nupr1
## PC_ 5
## Positive: Lyz2, Fcer1g, Wfdc17, Tyrobp, Ctss, Ccl6, Fcgr3, Laptm5, Ms4a6d, Trem2
## Plek, Pla2g7, Cd53, Adam8, Gpnmb, C1qb, Pirb, C1qc, Mpeg1, Fcgr2b
## C5ar1, C1qa, Lilr4b, Clec4d, Cd84, Rac2, Cybb, Pmp22, Rgs1, Csf1r
## Negative: Ttr, Tff3, Lypd8, Ambp, AA467197, 2210407C18Rik, Akr1b7, Lrg1, Spink4, Fga
## Dmbt1, Upp1, Alb, Sptssb, Pigr, Fcgbp, Clca1, Sult1d1, Pah, Cyp2c65
## Gm3336, Sult1c2, Cfi, Lgals4, Serping1, Fgg, AC102496.1, Ret, Snhg18, Txnip
ElbowPlot(cells)
Highly variable genes.
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(cells), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(cells)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## When using repel, set xnudge and ynudge to 0 for optimal results
plot2
Visualize using UMAP. Colored by cell state.
cells <- RunUMAP(object = cells, dims = 1:10)
#pdf("FeaturePlots_FC.pdf")
DimPlot(object = cells, reduction = "umap")
FeaturePlot(cells, features = marker_genes[1:4])
FeaturePlot(cells, features = marker_genes[5:8])
FeaturePlot(cells, features = marker_genes[9:12])
FeaturePlot(cells, features = marker_genes[13])
FeaturePlot(cells, features = c("EGFP.dna","lacZ.dna","mCerulean.dna","mCherry.dna"))
FeaturePlot(cells, features = c("mCherry.dna","mOrange.dna","tdrfp.dna"))
#dev.off()
#Extract data, phenotype data, and feature data from the SeuratObject
#https://github.com/cole-trapnell-lab/monocle-release/issues/262
data <- as(as.matrix(cells@assays$RNA@data), 'sparseMatrix')
pd <- new('AnnotatedDataFrame', data = cells@meta.data)
fData <- data.frame(gene_short_name = row.names(data), row.names = row.names(data))
fd <- new('AnnotatedDataFrame', data = fData)
monocle_cds <- newCellDataSet(data,
phenoData = pd,
featureData = fd,
lowerDetectionLimit = 0.5,
expressionFamily = negbinomial.size())
# generate size factors for normalization later
monocle_cds <- estimateSizeFactors(monocle_cds)
# markers for common cell types
marker_file_path <- "/Users/tfriedrich/Downloads/markers_garnett.txt"
marker_check <- check_markers(monocle_cds, marker_file_path,
db=org.Mm.eg.db,
cds_gene_id_type = "SYMBOL",
marker_file_gene_id_type = "SYMBOL")
## There are 3 cell type definitions
plot_markers(marker_check)
## Warning: Removed 5 rows containing missing values (geom_point).
## Warning: Removed 5 rows containing missing values (geom_text_repel).
# train cells using marker genes as labels Garnett classification is trained using a multinomial elastic-net regression. This means that certain genes are chosen as the relevant genes for distinguishing between cell types. Which genes are chosen may be of interest, so Garnett includes a function to access the chosen genes. Note: Garnett does not regularize the input markers, so they will be included in the classifier regardless.
set.seed(260)
pbmc_classifier <- train_cell_classifier(cds = monocle_cds,
marker_file = marker_file_path,
db=org.Mm.eg.db,
cds_gene_id_type = "SYMBOL",
num_unknown = 50,
marker_file_gene_id_type = "SYMBOL")
## There are 3 cell type definitions
## Warning in make_name_map(parse_list,
## as.character(row.names(fData(norm_cds))), : 5 genes could not be converted
## from SYMBOL to ENSEMBL These genes are listed below:
## Warning in make_name_map(parse_list, as.character(row.names(fData(norm_cds))), : The following genes from the cell type definition file are notpresent in the cell dataset. Please check these genes for errors.Cell type determination will continue, ignoring these genes.
## Opn
## Cypa1
## Cyp3a4
## Aat
## Cyp3a7
## training_sample
## cholangiocyte hepatoblast hepatocyte Unknown
## 500 8 500 50
## Model training finished.
head(pData(monocle_cds))
## orig.ident nCount_RNA nFeature_RNA percent.mito
## AAACCTGAGACTGGGT FC 12250 3112 0.009061224
## AAACCTGAGCACACAG FC 47677 6523 0.012165195
## AAACCTGAGTTAACGA FC 14352 3506 0.005643813
## AAACCTGCAAGGTGTG FC 27491 5257 0.013822706
## AAACCTGCATGATCCA FC 18650 3994 0.006702413
## AAACCTGCATGCCACG FC 53461 6647 0.009726717
## S.Score G2M.Score Phase old.ident Size_Factor
## AAACCTGAGACTGGGT 9.873166e-05 -0.05839754 S FC 1.2089628
## AAACCTGAGCACACAG -3.734501e-02 -0.04636801 G1 FC 0.9440832
## AAACCTGAGTTAACGA -2.359718e-02 -0.07000838 G1 FC 1.3806441
## AAACCTGCAAGGTGTG -2.649230e-02 -0.07620632 G1 FC 1.1419254
## AAACCTGCATGATCCA -3.156153e-02 -0.02979146 G1 FC 0.8796619
## AAACCTGCATGCCACG 2.837445e-01 0.51890682 G2M FC 0.9898649
feature_genes <- get_feature_genes(pbmc_classifier,
node = "root",
db = org.Mm.eg.db,
convert_ids = TRUE)
head(feature_genes)
## cholangiocyte hepatoblast hepatocyte Unknown
## (Intercept) 1.77910158 -4.430007e+00 1.180655465 1.4702498367
## Imp4 -0.02799056 -6.411412e-05 -0.002402989 0.0304576677
## Plekhb2 0.02290446 -1.202628e-03 -0.021853171 0.0001513424
## Txndc9 0.02469316 3.076515e-03 0.067396988 -0.0951666603
## Pdcl3 -0.03381068 1.768482e-03 -0.002048701 0.0340908992
## Cavin2 0.01853130 -3.062922e-04 0.018786126 -0.0370111352
write.table(feature_genes, "/Users/tfriedrich/Downloads/garnett_elastic_regression_feature_genes.xls", sep="\t", col.names=NA)
DE <- FindMarkers(cells, ident.1 = "G1", ident.2 = "S")